%%HTML
<style type="text/css">
table.dataframe td, table.dataframe th {
border: 1px black solid;
color: black;
}
# Quick exploratory look at the dataset, then drop columns we don't need.
df.shape
# Column names, dtypes, and non-null counts.
df.info()
# Check whether null values could affect the analysis.
df.isnull().sum()
# Drop identifier columns that carry no analytical signal.
# FIX: `axis=1` was redundant (and confusing) alongside the `columns=` keyword.
df.drop(columns=['tweet_id', 'name'], inplace=True)
df.columns
# Score each tweet with NLTK's pre-trained VADER sentiment analyzer.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Sanity check on the first tweet: returns a dict of neg/neu/pos/compound.
sid.polarity_scores(df.text[0])
# Store the full score dict for every tweet.
df['polarity_score'] = df.text.apply(lambda x: sid.polarity_scores(x))
df.head()
# Keep only the 'compound' score (overall polarity in [-1, 1]).
df['score'] = df.polarity_score.apply(lambda x: x['compound'])
def find_sentiment(x):
    """Map a VADER compound score to a sentiment label.

    Returns "Positive" for x > 0, "Negative" for x < 0, "Neutral" for x == 0.
    """
    if x == 0:
        return "Neutral"
    return "Positive" if x > 0 else "Negative"
# Label each tweet from its compound score.
df["sentiment"] = df.score.apply(lambda x: find_sentiment(x))
df.head()
# Drop the intermediate scoring columns now that the label exists.
df.drop(columns = ['polarity_score', 'score'], axis = 1, inplace = True)
df.head()
# Class distribution of the three sentiment labels.
df.sentiment.value_counts()
# Quick seaborn count plot of the sentiment distribution.
sns.countplot(x = 'sentiment', data = df)
# Interactive Plotly bar chart of the same counts, one color per class,
# with the count printed on each bar.
fig = go.Figure()
c = ['green','red','blue']
fig.add_trace(go.Bar(x = df.sentiment.value_counts().index,
y = df.sentiment.value_counts(),
marker_color = c,
text = df.sentiment.value_counts(),
textposition='auto'))
fig.update_layout(title = dict(text = "Sentiment count"))
In this section, each tweet's text is assigned to a related topic based on its key words.
import spacy
nlp = spacy.load('en_core_web_sm')
def tokenization(s):
    """Lowercase *s*, run it through spaCy, and return the lemma of each token."""
    doc = nlp(s.lower())
    return [tok.lemma_ for tok in doc]
# Lemmatize every tweet (stores a list of lemmas per row).
df["lemma"] = df.text.apply(lambda x: tokenization(x))
df.head()
def stopwords(s):
    """Drop spaCy stop words from the token list *s* and re-join into one string."""
    kept = [tok for tok in s if tok not in nlp.Defaults.stop_words]
    return " ".join(kept)
# Rebuild each tweet as a stop-word-free string of lemmas.
df['proccesed_text'] = df.lemma.apply(lambda x: stopwords(x))
from sklearn.feature_extraction.text import TfidfVectorizer
# Ignore terms appearing in >90% of tweets or in fewer than 10 tweets.
tdf = TfidfVectorizer(max_df=0.9, min_df=10)
dtm = tdf.fit_transform(df["proccesed_text"])
# LDA topic modelling on the TF-IDF document-term matrix.
# BUG FIX: the original line read `from sklearn.decomposition import ss`,
# which is not a real name and raises ImportError; LatentDirichletAllocation
# is the class actually used below.
from sklearn.decomposition import LatentDirichletAllocation

# 10 topics; fixed random_state for reproducible topic assignments.
lda = LatentDirichletAllocation(n_components=10, random_state=0)
lda.fit(dtm)
# Print the 20 highest-weight words for each LDA topic.
# argsort is ascending, so the last 20 indices are the top-weighted terms.
# NOTE(review): TfidfVectorizer.get_feature_names() was removed in
# scikit-learn 1.2; newer versions need get_feature_names_out() — confirm
# the installed version before running.
for i, topic in enumerate(lda.components_):
print(f"Top 20 words for topic {i}")
print([tdf.get_feature_names()[index] for index in topic.argsort()[-20:]])
print("\n")
# Assign each tweet its most probable topic (argmax over topic probabilities).
topic = lda.transform(dtm)
df["topic"] = topic.argmax(axis = 1)
df.head()
# The token columns are no longer needed once topics are assigned.
df.drop(columns=['lemma', 'proccesed_text'], axis =1, inplace = True)
df.head()
# Frequency of each stated complaint reason, shown as a Plotly bar chart.
reason = df.negativereason.value_counts()
fig = go.Figure()
fig.add_trace(go.Bar(x = reason.index,
y = reason))
# Distinct retweet counts present in the data.
df.retweet_count.unique()
To analyze complaints, we keep only the tweets classified as negative by the sentiment analysis.
# Keep only the negatively-classified tweets for complaint analysis.
df_neg = df[df['sentiment'] == 'Negative']
df_neg.head()
# Persist the subset for later reuse.
df_neg.to_csv (r'negativeTweets.csv', index = False, header=True)
import collections
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import rcParams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import word_tokenize
# read the tweet from the data.
def removeAirline(userTyped):
    """Strip a leading "@airline" mention from a tweet.

    word_tokenize splits "@united thanks" into ['@', 'united', 'thanks'];
    when the text starts that way, drop both tokens and re-join the rest.
    Otherwise the text is returned unchanged.
    """
    tokens = word_tokenize(userTyped)
    # BUG FIX: the original popped twice without checking length, raising
    # IndexError for input that tokenizes to just ['@'] (or to nothing).
    if len(tokens) >= 2 and tokens[0] == '@':
        return TreebankWordDetokenizer().detokenize(tokens[2:])
    return userTyped
# Collect the key words for a given complaint reason.
# Each reason gets its own word cloud as output.
def wordCloud_res(reason):
    """Render a word cloud of the negative tweets filed under *reason*."""
    df_res = df_neg[df_neg['negativereason'] == reason]
    # Concatenate every matching tweet, minus the leading @airline mention.
    all_headlines = ' '
    for line in df_res['text'].str.lower():
        all_headlines += removeAirline(line)
    # BUG FIX: the original did `stopwords = STOPWORDS; stopwords.add('will')`,
    # which mutated the shared wordcloud.STOPWORDS set for the whole process.
    # Copy it before adding the extra word.
    stopwords = set(STOPWORDS)
    stopwords.add('will')
    wordcloud = WordCloud(stopwords=stopwords, background_color="white", max_words=500).generate(all_headlines)
    print('With the reason of: ' + reason)
    rcParams['figure.figsize'] = 10, 20
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# Distinct complaint reasons (may include NaN for tweets with no stated reason).
reason = df_neg['negativereason'].unique()
reason
# Notebook output: 'Bad Flight', 'Late Flight', 'Customer Service Issue', 'Flight Booking Problems', 'Lost Luggage', 'Flight Attendant Complaints', 'Cancelled Flight', 'Damaged Luggage', 'longlines'
The following cells render a word cloud for each complaint reason.
# Render a word cloud for every complaint reason.
# BUG FIX: the original hard-coded wordCloud_res(reason[0]) ... reason[10];
# with ten unique values the valid indices are 0-9, so reason[10] raised
# IndexError (and indices 1-2 were silently skipped). Iterate the array
# instead, skipping the NaN entry left by tweets that gave no reason.
for r in reason:
    if isinstance(r, str):
        wordCloud_res(r)